Vdata <- read.csv("../data/videogames.csv")
#Vdata <- Vdata[as.numeric(as.character(Vdata$Year_of_Release)) > 2009, ] # only game after 2010
Vdata <- Vdata[(!is.na(Vdata$Critic_Score)),] # remove missing values
Anova_test <- function(classifier, data = Vdata) {
summary(aov(data$Global_Sales ~ data[[classifier]]))
}
get_freq <- function(x, breaks) {
len <- length(breaks)
res <- numeric(len)
for (i in seq_len(len)) {
res[i] <- sum(breaks[i] < x & x <= breaks[i+1])
}
res
}
get_density <- function(x, breaks, by) {
res <- get_freq(x, breaks) / (length(x) * by)
res
}
get_density_idx <- function(gs) {
ceiling(20 * gs)
}
testclass <- function(classifier, data=Vdata){
table <- tapply(Vdata$Global_Sales, Vdata[[classifier]], function(x){x})
table[[1]] <- NULL
par(mfrow = c(1, 2))
boxplot(table, las = 3, cex=0.2, pch=20)
boxplot(table, ylim=c(0,5), las = 3, cex=0.2,pch=20)
Anova_test(classifier)
}
rangef <- function(x, divider=5) {
x %/% divider * divider + divider/2
}
## Df Sum Sq Mean Sq F value Pr(>F)
## data[[classifier]] 11 190 17.294 5.27 2.37e-08 ***
## Residuals 8125 26662 3.281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Df Sum Sq Mean Sq F value Pr(>F)
## data[[classifier]] 11 190 17.294 5.27 2.37e-08 ***
## Residuals 8125 26662 3.281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Df Sum Sq Mean Sq F value Pr(>F)
## data[[classifier]] 303 2447 8.077 2.592 <2e-16 ***
## Residuals 7833 24405 3.116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Df Sum Sq Mean Sq F value Pr(>F)
## data[[classifier]] 7 248 35.45 10.83 1.14e-13 ***
## Residuals 8129 26604 3.27
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot((Vdata$Critic_Score*Vdata$Critic_Count), ylim=c(0, 5),Vdata$Global_Sales, cex=0.1)
plot(sqrt(Vdata$Critic_Score*Vdata$Critic_Count), ylim=c(0, 5),Vdata$Global_Sales, cex=0.1)
plot(log(Vdata$Critic_Score*Vdata$Critic_Count), ylim=c(0, 5),Vdata$Global_Sales, cex=0.1)
cor.test(as.numeric(as.character(Vdata$User_Score)), Vdata$Critic_Score)
## Warning in cor.test(as.numeric(as.character(Vdata$User_Score)),
## Vdata$Critic_Score): NAs introduced by coercion
##
## Pearson's product-moment correlation
##
## data: as.numeric(as.character(Vdata$User_Score)) and Vdata$Critic_Score
## t = 59.769, df = 7015, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5651609 0.5961733
## sample estimates:
## cor
## 0.5808778
#plot((Vdata$Critic_Score),Vdata$Global_Sales, cex=0.1, ylim=c(0,20))
x <- seq(0,100,len=1000)
y <- 1/1500*x^2
idx <- Vdata$Global_Sales <= (Vdata$Critic_Score)^2/1500
plot((Vdata$Critic_Score)[idx],Vdata$Global_Sales[idx], cex=0.1)
lines(x,y,lty=3,col="red")
plot(factor((Vdata$Critic_Score)[idx]), ylim=c(0,5),Vdata$Global_Sales[idx], cex=0.1)
f <- 1/20000000*x^4+1/1500000000*x^5+0.2
t <- 1/50000000*x^4
m <- 1/80000000*x^4+1/2000000000*x^5+0.1
lines(x,f,lty=2,col="red")
lines(x,t,lty=2,col="blue")
lines(x,m,lty=2,col="green")
# 5 range
Vdata <- Vdata[idx, ]
Vdata <- Vdata[Vdata$Global_Sales <= 4, ]
ran <- rangef(Vdata$Critic_Score)
plot(factor(ran), ylim=c(0,5),Vdata$Global_Sales, cex=0.1)
for (i in seq(11)) {
hist(Vdata$Global_Sales[ran == (32.5+5*i)], breaks=seq(0,6,by=0.05), main =
paste("Sale for Critic Score within ", "[",as.character(32.5+5*i-2.5), ",",
as.character(32.5+5*i+2.5),"]"), xlab="Global Sale", probability = TRUE)
lines(seq(0,4,by=0.05)+0.025, get_density(Vdata$Global_Sales[ran == (32.5+5*i)], seq(0,4,by=0.05), 0.05), col="blue")
legend("topright", "Density Curve", lwd = 0.5, col = "blue")
}
Propose model: \(\hat f(CS, GS) = max(\frac{W_1}{(GS+W_3)^{W_2 \times CS}}+W_4, 0)\)
Objective function: \(\sum(f(CS,GS)-\hat f(CS,GS))^2\)
ran <- rangef(Vdata$Critic_Score)
dataf <- data.frame("Global_Sale" = numeric(0), "Critic_Score" = numeric(0), "Density" = numeric(0))
for (i in seq(11)) {
gs <- Vdata$Global_Sales[ran == (32.5+5*i)]
cs <- Vdata$Critic_Score[ran == (32.5+5*i)]
ccount <- log10(Vdata$Critic_Count[ran == (32.5+5*i)])
ds <- get_density(Vdata$Global_Sales[ran == (32.5+5*i)], seq(0,4,by=0.05), 0.05)
ds_idx <- get_density_idx(gs)
cs_range <- 32.5+5*i
ds <- ds[ds_idx]
dataf <- rbind(dataf, cbind(gs, cs_range,ccount, cs,ds))
}
write.csv(dataf, "CS_GL_FQ.csv")
Pdata <- read.csv('predictedData.csv')
for (i in seq(11)) {
hist(Vdata$Global_Sales[ran == (32.5+5*i)], breaks=seq(0,4,by=0.05), main =
paste("Sale for Critic Score within ", "[",as.character(32.5+5*i-2.5), ",",
as.character(32.5+5*i+2.5),"]"), xlab="Global Sale", probability = TRUE)
lines(seq(0,4,by=0.05)+0.025, get_density(Vdata$Global_Sales[ran == (32.5+5*i)], seq(0,4,by=0.05), 0.05), col="blue")
gs <- Pdata$gs[Pdata$cs_range == (32.5+5*i)]
y <- Pdata[[6]][Pdata$cs_range == (32.5+5*i)]
points(gs, y, col='red', pch=19,cex=0.5)
legend("topright", c("Density Curve", "Predicted By Neural Network"), lwd = c(0.5, NA), col = c("blue", 'red'), pch=c(NA, 19))
}
Pdata <- read.csv('predictedData2.csv')
for (i in seq(11)) {
hist(Vdata$Global_Sales[ran == (32.5+5*i)], breaks=seq(0,4,by=0.05), main =
paste("Sale for Critic Score within ", "[",as.character(32.5+5*i-2.5), ",",
as.character(32.5+5*i+2.5),"]"), xlab="Global Sale", probability = TRUE)
lines(seq(0,4,by=0.05)+0.025, get_density(Vdata$Global_Sales[ran == (32.5+5*i)], seq(0,4,by=0.05), 0.05), col="blue")
gs <- Pdata$gs[Pdata$cs_range == (32.5+5*i)]
y <- Pdata[[6]][Pdata$cs_range == (32.5+5*i)]
points(gs, y, col='red', pch=19,cex=0.5)
legend("topright", c("Density Curve", "Predicted By Neural Network"), lwd = c(0.5, NA), col = c("blue", 'red'), pch=c(NA, 19))
}